#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

#include <cmath>
#include <iostream>

#include "include/host_device.h"

// The technique demonstrated in the example monte_carlo.cu
// assigns an independently seeded random number generator to each
// of 30K threads, and uses a hashing scheme based on thread index to
// seed each RNG. This technique, while simple, may be succeptible
// to correlation among the streams of numbers generated by each RNG
// because there is no guarantee that the streams are not disjoint.
// This example demonstrates a slightly more sophisticated technique
// which ensures that the subsequences generated in each thread are
// disjoint. To achieve this, we use a single common stream
// of random numbers, but partition it among threads to ensure no overlap
// of substreams. The substreams are generated procedurally using
// default_random_engine's discard(n) member function, which skips
// past n states of the RNG. This function is accelerated and executes
// in O(lg n) time.

struct estimate_pi
{
  __host__ __device__ float operator()(unsigned int thread_id)
  {
    float sum      = 0;
    unsigned int N = 5000; // samples per stream

    // note that M * N <= default_random_engine::max,
    // which is also the period of this particular RNG
    // this ensures the substreams are disjoint

    // create a random number generator
    // note that each thread uses an RNG with the same seed
    thrust::default_random_engine rng;

    // jump past the numbers used by the subsequences before me
    rng.discard(N * thread_id);

    // create a mapping from random numbers to [0,1)
    thrust::uniform_real_distribution<float> u01(0, 1);

    // take N samples in a quarter circle
    for (unsigned int i = 0; i < N; ++i)
    {
      // draw a sample from the unit square
      float x = u01(rng);
      float y = u01(rng);

      // measure distance from the origin
      float dist = sqrtf(x * x + y * y);

      // add 1.0f if (u0,u1) is inside the quarter circle
      if (dist <= 1.0f)
      {
        sum += 1.0f;
      }
    }

    // multiply by 4 to get the area of the whole circle
    sum *= 4.0f;

    // divide by N
    return sum / N;
  }
};

int main()
{
  // use 30K subsequences of random numbers
  int M = 30000;

  float estimate = thrust::transform_reduce(
    thrust::counting_iterator<int>(0),
    thrust::counting_iterator<int>(M),
    estimate_pi(),
    0.0f,
    ::cuda::std::plus<float>());
  estimate /= M;

  std::cout << "pi is around " << estimate << std::endl;

  return 0;
}
